*************************************************   
* do-file for estimation with full-time sample	*
* fo 02.02.2016									*
*************************************************

* --- session setup ---------------------------------------------------------
* Open a text log, load the imputed full-time sample and declare F4QWT as the
* probability weight for all subsequent -mi estimate: svy:- calls.
log using "Output\analysis", text replace
set more off
use "Data\NELS_ft_imputed.dta", clear
mi svyset [pweight = F4QWT]

* Covariate blocks that are added step by step in the sequential regressions.
local background asian latino black parses
local values     zigeld
local education  score_read score_math score_scie grade_eng grade_math grade_scie
local education  `education' fa_frau degree_ma degree_phd
local family     single married divorced childrenxsingle nrchildren
* beruf_8 and branche_7 are left out of the dummy ranges
local work       arbzt_woch privat beruf_1-beruf_7 beruf_9-beruf_30
local work       `work' branche_1-branche_6 branche_8-branche_17 autonom jobtrain

************************************************
*** 1. replicate Bobbitt-Zeher(2007) ***
************************************************

*** 1.1 replicate Bobbitt-Zeher(2007), table 1 and appendix table ***
* Descriptives table tA1: 78 rows (77 variables plus N) by 8 columns
* (mean and s.e. for women / men, full-time sample and pooled sample).
matrix tA1 = J(78, 8, .)

* Row labels have the form <row number>_<variable>. After the first industry
* row (29) and the first occupation row (46) the remaining dummy rows carry
* the bare row number only.
local rlab
local r = 0
foreach v in inc_year score_read score_math score_scie grade_eng grade_math grade_scie ///
	ugrad_gpa fachgr_1 fachgr_2 fachgr_3 fachgr_4 fa_frau ///
	degree_ba degree_ma degree_phd parses white black latino asian ///
	zigeld single married childrenxsingle nrchildren arbzt_woch privat {
	local ++r
	local rlab `rlab' `r'_`v'
}
local rlab `rlab' 29_branche
forvalues r = 30/45 {
	local rlab `rlab' `r'
}
local rlab `rlab' 46_beruf
forvalues r = 47/75 {
	local rlab `rlab' `r'
}
local rlab `rlab' 76_autonom 77_jobtrain 78_N
matrix rownames tA1 = `rlab'

matrix colnames tA1 = 1_women_ft 2_women_ft_se 3_men_ft 4_men_ft_se ///
	5_women 6_women_se 7_men 8_men_se

/* 	matrix for tA1
	rows: variables
	 1 - income
	 2 - test score reading 12th grade
	 3 - test score math 12th grade
	 4 - test score science 12th grade
	 5 - grade english 12th grade
	 6 - grade math 12th grade
	 7 - grade science 12th grade
	 8 - undergrad gpa
	 9 - college major: business, law, etc.
	10 - college major: math, nat. sci., engineering
	11 - college major: soc. sci., humanities
	12 - college major: education
	13 - percentage female of major
	14 - highest degree: bachelor degree
	15 - highest degree: masters degree
	16 - highest degree: doctoral degree
	17 - ses family of origin
	18 - race: white
	19 - race: black
	20 - race: latino
	21 - race: asian-american
	22 - importance of having lots of money
	23 - family char: single
	24 - family char: married or marr.like rel.
	25 - family char: single parent
	26 - number of children
	27 - number of hours worked, typical week
	28 - private sector
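	29-45 - industry dummies branche_1 - branche_17 (first category: agriculture)
	46-75 - occupation dummies beruf_1 - beruf_30
	76 - job autonomy
	77 - job training
	78 - number of observations (N)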
	
	columns:
	Full-time only
	 1 - women's mean
	 2 - s.e.
	 3 - men's mean
	 4 - s.e.
	Full-time and part-time
	 5 - women's mean
	 6 - s.e.
	 7 - men's mean
	 8 - s.e.
*/

* Fill tA1 columns 1-4 (full-time sample): for every descriptive variable store
* the survey-weighted, multiply-imputed mean and its s.e., women first (cols 1-2),
* then men (cols 3-4). The local count tracks the tA1 row of the current variable.
local count = 0
foreach var of varlist	inc_year score_read score_math score_scie grade_eng grade_math grade_scie ///
	ugrad_gpa fachgr_1 fachgr_2 fachgr_3 fachgr_4 fa_frau ///
	degree_ba degree_ma degree_phd parses white black latino asian ///
	zigeld single married childrenxsingle nrchildren arbzt_woch privat ///
	branche_1-branche_17 ///
	beruf_1-beruf_30 ///
	autonom jobtrain {

	local ++count

	* progress marker in the log: row number and variable name
	display " "
	display " *** "
	display "`count'"
	display "`var'"
	display " *** "
	display " "

	foreach sex in 1 0 {
		* frau==1 (women) -> columns 1/2, frau==0 (men) -> columns 3/4
		local mcol  = cond(`sex' == 1, 1, 3)
		local secol = `mcol' + 1
		mi estimate, post: svy: mean `var' if frau==`sex'
		matrix tA1[`count', `mcol']  = _b[`var']
		matrix tA1[`count', `secol'] = _se[`var']
	}
}

* Row 78: number of observations by sex, taken from e(N) of a dummy estimation
foreach sex in 1 0 {
	local ncol = cond(`sex' == 1, 1, 3)
	mi estimate, post: svy: mean frau if frau==`sex'
	matrix tA1[78, `ncol'] = e(N)
}

	
***	1.2 replicate Bobbitt-Zeher(2007), table 3 ***

* Results matrix for the sequential regressions.
*	rows:    models 1-5 and 7 (row 6 stays empty in this table)
*	columns: 1 model number, 2 conditional beta female, 3 s.e.,
*	         4 share of the raw gap explained, 5 change in that share vs. the previous model
matrix t1_exact = J(7, 5, .)
matrix colnames t1_exact = 1_modelnr 2_betafemale 3_se 4_perc_expl 5_delta_expl

* Nested right-hand sides: each model adds one covariate block to the previous one
local spec1
local spec2 `background'
local spec3 `spec2' `values'
local spec4 `spec3' `education'
local spec5 `spec4' `family'
local spec7 `spec5' `work'

foreach m of numlist 1/5 7 {
	mi estimate, post: quietly regress inc_year frau `spec`m'' [pweight=F4QWT]
	if `m' == 1 {
		* raw gender gap, denominator for the share explained
		local baseline = _b[frau]
	}
	matrix t1_exact[`m', 2] = _b[frau]
	matrix t1_exact[`m', 3] = _se[frau]
}

* compute percentage explained *
forvalues row = 1/7 {
	local prev = `row' - 1
	matrix t1_exact[`row', 1] = `row'
	matrix t1_exact[`row', 4] = (`baseline' - t1_exact[`row', 2]) / `baseline'
	matrix t1_exact[`row', 5] = t1_exact[`row', 4] - t1_exact[`prev', 4]
}
* model 7 should be compared with model 5, not with the empty row 6
matrix t1_exact[7, 5] = t1_exact[7, 4] - t1_exact[5, 4]

matrix list t1_exact
putexcel set "Output\t1_exact.xlsx", replace
putexcel B2 = matrix(t1_exact, names) using "Output\t1_exact.xlsx", replace

	
*** 1.3	replicate Bobbitt-Zeher(2007), table 4 ***

* Oaxaca-Blinder decomposition of inc_year by frau with the original covariate set;
* variables in parentheses are reported as one named group.
#delimit ;
mi estimate, cmdok post: oaxaca inc_year
	(background: parses asian latino black)                                      /* family SES and race */
	zigeld                                                                       /* importance of having lots of money */
	(scores: score_read score_math score_scie grade_eng grade_math grade_scie)   /* test scores and grades */
	fa_frau                                                                      /* percentage female of college major */
	(degree: degree_ma degree_phd)                                               /* highest degree */
	(family: single married childrenxsingle nrchildren)                          /* family formation */
	arbzt_woch                                                                   /* hours worked per week */
	(occupation: beruf_1-beruf_7 beruf_9-beruf_30)                               /* occupation */
	(industry: branche_1-branche_6 branche_8-branche_17)                         /* industry */
	privat                                                                       /* sector */
	(otherwork: autonom jobtrain)                                                /* autonomy, job training */
	, by(frau) weight(.5) svy ;
#delimit cr

estimates store t2_exact
esttab t2_exact using "Output\t2_exact.csv", replace wide se nostar mtitles


***	1.4 replicate with more reasonable family variables ***

* Same sequential regressions as t1_exact, but with the family block reduced to
* married + children. The redefinition of the local family stays in effect
* for the rest of the do-file.
*	rows:    models 1-5 and 7 (row 6 stays empty)
*	columns: 1 model number, 2 conditional beta female, 3 s.e.,
*	         4 share of the raw gap explained, 5 change in that share vs. the previous model
local family "married children"
matrix t1_modif = J(7 , 5 , .)
* FIX: the column names were previously assigned to t1_exact (copy-paste slip),
* which left t1_modif - and its Excel export - without column labels.
matrix colnames t1_modif = 1_modelnr 2_betafemale 3_se 4_perc_expl 5_delta_expl

mi estimate, post: qui reg inc_year frau [pweight=F4QWT]
	local baseline 		 = _b[frau]		/* raw gender gap, denominator for share explained */
	matrix t1_modif[1,2] = _b[frau]
	matrix t1_modif[1,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' [pweight=F4QWT]
	matrix t1_modif[2,2] = _b[frau]
	matrix t1_modif[2,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' [pweight=F4QWT]
	matrix t1_modif[3,2] = _b[frau]
	matrix t1_modif[3,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' `education' [pweight=F4QWT]
	matrix t1_modif[4,2] = _b[frau]
	matrix t1_modif[4,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' `education' `family' [pweight=F4QWT]
	matrix t1_modif[5,2] = _b[frau]
	matrix t1_modif[5,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' `education' `family' `work' [pweight=F4QWT]
	matrix t1_modif[7,2] = _b[frau]
	matrix t1_modif[7,3] = _se[frau]

* compute percentage explained *
forvalues row = 1/7	{
	matrix t1_modif[`row',1] = `row'
	matrix t1_modif[`row',4] = (`baseline' - t1_modif[`row',2]) / `baseline'
	matrix t1_modif[`row',5] = t1_modif[`row', 4] - t1_modif[`row'-1, 4] 
	}
	matrix t1_modif[7,5] = (t1_modif[7, 4] - t1_modif[5, 4])	/* model 7 should be compared with model 5, not empty row 6 */

matrix list t1_modif
putexcel set "Output\t1_modif.xlsx", replace
putexcel B2 = matrix(t1_modif, names) using "Output\t1_modif.xlsx", replace

	
*** 1.5	OB-decomposition with more reasonable family variables ***

* Oaxaca-Blinder decomposition of inc_year by frau; identical to the t2_exact
* specification except that the family group is reduced to married + children.
#delimit ;
mi estimate, cmdok post: oaxaca inc_year
	(background: parses asian latino black)                                      /* family SES and race */
	zigeld                                                                       /* importance of having lots of money */
	(scores: score_read score_math score_scie grade_eng grade_math grade_scie)   /* test scores and grades */
	fa_frau                                                                      /* percentage female of college major */
	(degree: degree_ma degree_phd)                                               /* highest degree */
	(family: married children)                                                   /* family formation */
	arbzt_woch                                                                   /* hours worked per week */
	(occupation: beruf_1-beruf_7 beruf_9-beruf_30)                               /* occupation */
	(industry: branche_1-branche_6 branche_8-branche_17)                         /* industry */
	privat                                                                       /* sector */
	(otherwork: autonom jobtrain)                                                /* autonomy, job training */
	, by(frau) weight(.5) svy ;
#delimit cr

estimates store t2_modif
esttab t2_modif using "Output\t2_modif.csv", replace wide se nostar mtitles


************************************************************************
*** 2. correct for endogeneous sample selection and misspecification ***
************************************************************************

* Add part-time and non-full-year workers to the full-time data in memory.
* The new variable source marks the origin of each row (0 = data in memory, 1 = appended file).
* NOTE(review): the data are mi set; Stata documents -mi append- for combining mi data.
* Confirm that plain -append- leaves the mi structure intact for these files.
append using "Data\NELS_pt_imputed.dta", generate(source)
mi svyset [pweight = F4QWT]


*** 2.0 Demonstrate that sample restriction is endogeneous to motherhood ***
	
* matrix for hrswork * 
* tA3: mean weekly hours (arbzt_woch) of parents (children == 1).
*	row 1 (ftyr): full-time / year-round workers only (source == 0)
*	row 2 (both): pooled sample
*	columns 1-4: mothers (estimate, CI lower bound, CI upper bound, N); columns 5-8: fathers
matrix tA3 = J(2, 8, .)
matrix rownames tA3 = ftyr both
matrix colnames tA3 = mothers_b mothers_cilb mothers_ciub mothers_N ///
	fathers_b fathers_cilb fathers_ciub fathers_N

forvalues smp = 1/2 {
	* row 1 adds the restriction to the full-time file, row 2 uses everybody
	local restrict
	if `smp' == 1 {
		local restrict "& source == 0"
	}
	foreach sex in 1 0 {
		* mothers fill columns 1-4, fathers columns 5-8
		local base = cond(`sex' == 1, 0, 4)
		local cb   = `base' + 1
		local clo  = `base' + 2
		local chi  = `base' + 3
		local cn   = `base' + 4

		mi estimate, post: svy: mean arbzt_woch if frau == `sex' & children == 1 `restrict'
		* rows 1, 5 and 6 of r(table) are stored as estimate / lower / upper bound
		* (presumably b, ll, ul - verify against the r(table) layout of the Stata version in use)
		matrix table = r(table)
		matrix tA3[`smp', `cb']  = table[1,1]
		matrix tA3[`smp', `clo'] = table[5,1]
		matrix tA3[`smp', `chi'] = table[6,1]
		matrix tA3[`smp', `cn']  = e(N)
	}
}
		
* Park the pooled analysis data on disk, then turn tA3 into a two-row dataset
* that can be plotted (the analysis data are reloaded from the temporary file later).
save "Data\temporary.dta", replace
matrix list tA3
svmat tA3, names(col)
keep mothers_b-fathers_N
keep if _n <= 2

* sample: 0 = restricted (first row of tA3), 1 = unrestricted (second row)
generate sample = _n - 1

format mothers_b fathers_b %4.1f
format mothers_N fathers_N %3.0f

* marker labels showing the number of observations
generate mothers_Nstr = "n = " + string(mothers_N)
generate fathers_Nstr = "n = " + string(fathers_N)

* Plot mean weekly hours of mothers and fathers with confidence-interval caps,
* for the restricted (sample = 0) and unrestricted (sample = 1) sample.
* Each point is drawn twice so that it can carry two marker labels:
* the mean above (position 12) and "n = ..." below (position 6).
* Plot numbering: 1/2 mothers' points, 3 mothers' CI, 4/5 fathers' points, 6 fathers' CI.
* FIX: legend(order(1 3 4 6)) keeps the duplicate point layers 2 and 5 out of the
* legend; without it they appear as two extra keys with default labels.
twoway	(scatter sample mothers_b, msize(huge) mcolor(orange_red) ///
 					mlabel(mothers_b) mlabgap(3) mlabcolor(black) mlabposition(12) mlabsize(vlarge)) ///
		(scatter sample mothers_b, msize(huge) mcolor(orange_red) ///
 					mlabel(mothers_Nstr) mlabgap(3) mlabcolor(black) mlabposition(6) mlabsize(large)) ///
		(rcap mothers_cilb mothers_ciub sample, horizontal msize(huge) lcolor(orange_red)) ///
		(scatter sample fathers_b, msize(huge) mcolor(ebblue) msymbol(diamond) ///
					mlabel(fathers_b) mlabgap(3) mlabcolor(black) mlabposition(12) mlabsize(vlarge)) ///
		(scatter sample fathers_b, msize(huge) mcolor(ebblue) msymbol(diamond) ///
					mlabel(fathers_Nstr) mlabgap(3) mlabcolor(black) mlabposition(6) mlabsize(large)) ///
					(rcap fathers_cilb fathers_ciub sample, horizontal msize(huge) lcolor(ebblue)), ///
		ysize(1) 	ylabel(0 "restricted" 1 "unrestricted", labsize(medlarge)) yscale(lcolor(gs14) range(-.4 1.4)) ///
		ytitle("sample") ///
		xsize(2.4) 	xlabel(35[1]50, labsize(medlarge) format(%2.0f)) xscale(lcolor(gs14)) xtick(35[1]50, grid glcolor(white)) ///
		plotregion(color(gs14)) graphregion(color(white)) ///
		legend(order(1 3 4 6) rows(1) label(1 "Mothers") label(3 "95% confidence interval") label(4 "Fathers") label(6 "95% confidence interval") region(lcolor(white)))
		graph save Graph "Output\hrswork.gph", replace
		

*** 2.1 Descriptives for full sample ***
* Reload the pooled analysis data (saved before the plotting detour) and fill
* tA1 columns 5-8: survey-weighted, multiply-imputed mean and s.e. for women
* (cols 5-6) and men (cols 7-8). tA1 itself was created in section 1.1.
use "Data\temporary.dta", clear

local count = 0
foreach var of varlist	inc_year score_read score_math score_scie grade_eng grade_math grade_scie ///
	ugrad_gpa fachgr_1 fachgr_2 fachgr_3 fachgr_4 fa_frau ///
	degree_ba degree_ma degree_phd parses white black latino asian ///
	zigeld single married childrenxsingle nrchildren arbzt_woch privat ///
	branche_1-branche_17 ///
	beruf_1-beruf_30 ///
	autonom jobtrain {

	local ++count

	* progress marker in the log: row number and variable name
	display " "
	display " *** "
	display "`count'"
	display "`var'"
	display " *** "
	display " "

	foreach sex in 1 0 {
		* frau==1 (women) -> columns 5/6, frau==0 (men) -> columns 7/8
		local mcol  = cond(`sex' == 1, 5, 7)
		local secol = `mcol' + 1
		mi estimate, post: svy: mean `var' if frau==`sex'
		matrix tA1[`count', `mcol']  = _b[`var']
		matrix tA1[`count', `secol'] = _se[`var']
	}
}

* Row 78: number of observations by sex
foreach sex in 1 0 {
	local ncol = cond(`sex' == 1, 5, 7)
	mi estimate, post: svy: mean frau if frau==`sex'
	matrix tA1[78, `ncol'] = e(N)
}

matrix list tA1
putexcel set "Output\tA1.xlsx", replace
putexcel B2 = matrix(tA1, names) using "Output\tA1.xlsx", replace


*** 2.2 Bobbitt-Zeher(2007), sequential decomposition with correction for moderation of family effect ***

matrix t1_corr = J(7 , 5 , .)
matrix colnames t1_corr = 1_modelnr 2_betafemale 3_se 4_perc_expl 5_delta_expl
/* matrix for t1_corr 
	rows: models (row 6 is only filled if the commented-out model below is re-enabled)
	columns:
	1 - model number
	2 - conditional beta female
	3 - s.e.
	4 - percentage of gap explained
	5 - change in percentage explained relative to the comparison model
	*/
mi estimate, post: qui reg inc_year frau [pweight=F4QWT]
	local baseline 		  = _b[frau]		/* raw gender gap, denominator for share explained */
	matrix t1_corr[1,2] = _b[frau]
	matrix t1_corr[1,3] = _se[frau]	
mi estimate, post: qui reg inc_year frau `background' [pweight=F4QWT]
	matrix t1_corr[2,2] = _b[frau]
	matrix t1_corr[2,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' [pweight=F4QWT]
	matrix t1_corr[3,2] = _b[frau]
	matrix t1_corr[3,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' `education' [pweight=F4QWT]
	matrix t1_corr[4,2] = _b[frau]
	matrix t1_corr[4,3] = _se[frau]
mi estimate, post: qui reg inc_year frau `background' `values' `education' married children [pweight=F4QWT]
	matrix t1_corr[5,2] = _b[frau]
	matrix t1_corr[5,3] = _se[frau]
/* mi estimate, post: qui reg inc_year frau `background' `values' `education' married children childrenxfrau [pweight=F4QWT]
	matrix t1_corr[6,2] = _b[frau]
	matrix t1_corr[6,3] = _se[frau] */
mi estimate, post: qui reg inc_year frau `background' `values' `education' married children `work' [pweight=F4QWT]
	matrix t1_corr[7,2] = _b[frau]
	matrix t1_corr[7,3] = _se[frau]

* compute percentage explained *
forvalues row = 1/7	{
	matrix t1_corr[`row',1] = `row'
	matrix t1_corr[`row',4] = (`baseline' - t1_corr[`row',2]) / `baseline'
	matrix t1_corr[`row',5] = (t1_corr[`row', 4] - t1_corr[`row'-1, 4])
	}
	matrix t1_corr[6,5] = (t1_corr[6, 4] - t1_corr[4, 4])	/* model 5b should be compared with model 4, not model 5 (missing while model 6 is commented out) */
	* FIX: model 7 adds the work block to model 5, so it must be compared with row 5.
	* The loop compared it with row 6, which is empty here and made the delta missing.
	matrix t1_corr[7,5] = (t1_corr[7, 4] - t1_corr[5, 4])

matrix list t1_corr
putexcel set "Output\t1_corr.xlsx", replace
putexcel B2 = matrix(t1_corr, names) using "Output\t1_corr.xlsx", replace

*** 2.2b Bobbitt-Zeher(2007), sequential decomposition with gender interactions of the family variables (marriedxfrau, childrenxfrau) ***

* Results matrix for the sequential regressions with gender-interacted family block.
*	rows:    models 1-4, 6 and 7 (row 5, the main-effects-only family model, is
*	         deliberately left empty in this table)
*	columns: 1 model number, 2 conditional beta female, 3 s.e.,
*	         4 share of the raw gap explained, 5 change in that share vs. the comparison model
matrix t1_corr_b = J(7, 5, .)
matrix colnames t1_corr_b = 1_modelnr 2_betafemale 3_se 4_perc_expl 5_delta_expl

* Nested right-hand sides; model 6 adds the family variables together with
* their interactions with frau, model 7 adds the work block on top.
local famint married children marriedxfrau childrenxfrau
local rhs1
local rhs2 `background'
local rhs3 `rhs2' `values'
local rhs4 `rhs3' `education'
local rhs6 `rhs4' `famint'
local rhs7 `rhs6' `work'

foreach m of numlist 1/4 6 7 {
	mi estimate, post: quietly regress inc_year frau `rhs`m'' [pweight=F4QWT]
	if `m' == 1 {
		* raw gender gap, denominator for the share explained
		local baseline = _b[frau]
	}
	matrix t1_corr_b[`m', 2] = _b[frau]
	matrix t1_corr_b[`m', 3] = _se[frau]
}

* compute percentage explained *
forvalues row = 1/7 {
	local prev = `row' - 1
	matrix t1_corr_b[`row', 1] = `row'
	matrix t1_corr_b[`row', 4] = (`baseline' - t1_corr_b[`row', 2]) / `baseline'
	matrix t1_corr_b[`row', 5] = t1_corr_b[`row', 4] - t1_corr_b[`prev', 4]
}
* model 6 should be compared with model 4, not with the empty row 5
matrix t1_corr_b[6, 5] = t1_corr_b[6, 4] - t1_corr_b[4, 4]

matrix list t1_corr_b
putexcel set "Output\t1_corr_b.xlsx", replace
putexcel B2 = matrix(t1_corr_b, names) using "Output\t1_corr_b.xlsx", replace


*** 2.3 Bobbitt-Zeher(2007), OB-decomposition with correction for misattribution of family effect ***

* Oaxaca-Blinder decomposition of inc_year by frau on the pooled sample, with the
* family group reduced to married + children; the relax option is passed to oaxaca.
#delimit ;
mi estimate, cmdok post: oaxaca inc_year
	(background: parses asian latino black)                                      /* family SES and race */
	zigeld                                                                       /* importance of having lots of money */
	(scores: score_read score_math score_scie grade_eng grade_math grade_scie)   /* test scores and grades */
	fa_frau                                                                      /* percentage female of college major */
	(degree: degree_ma degree_phd)                                               /* highest degree */
	(family: married children)                                                   /* family formation */
	arbzt_woch                                                                   /* hours worked per week */
	(occupation: beruf_1-beruf_7 beruf_9-beruf_30)                               /* occupation */
	(industry: branche_1-branche_6 branche_8-branche_17)                         /* industry */
	privat                                                                       /* sector */
	(otherwork: autonom jobtrain)                                                /* autonomy, job training */
	, by(frau) weight(.5) svy relax ;
#delimit cr

estimates store t2_corr
esttab t2_corr using "Output\t2_corr.csv", replace wide se nostar mtitles


*** 2.4 Bobbitt-Zeher(2007), OB-decomposition with correction for misattribution of family effect, no intervening work variables ***

* Oaxaca-Blinder decomposition of inc_year by frau on the pooled sample without
* the intervening work variables (hours, occupation, industry, sector, other work).
* FIX: the last line of the command no longer ends in a line continuation. The
* dangling continuation only worked because the following line was blank;
* otherwise it would have pulled the next statement into the oaxaca call.
mi estimate, cmdok post: oaxaca inc_year ///
	(background: parses asian latino black) ///										/* family SES and race */
	zigeld ///																		/* importance of having lots of money */ 	
	(scores: score_read score_math score_scie grade_eng grade_math grade_scie) /// 	/* test scores and grades */
	fa_frau ///																		/* percentage female of college major */
	(degree: degree_ma degree_phd) ///												/* highest degree */
	(family: married children), by(frau) weight(.5) svy relax						/* family formation */

est sto t2_corr_b
esttab  t2_corr_b using "Output\t2_corr_b.csv", wide nostar se mtitles replace

log close
*** END OF DOFILE ***
